library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(gvlma)
library(car)
## Loading required package: carData

We are going to look at property prices for Orange County. From there, we will try to predict the sale price using linear regression.

1 Importing Data

Bring in the data and make sure the data types are correct. If not, make the proper changes. The file is located within this project at data/prop_prices_reduced.csv.

# Load the property-price data. file.path() joins the components with a
# portable separator; a hard-coded backslash only resolves on Windows.
prices <- read.csv(file.path("data", "prop_prices_reduced.csv"))
# Inspect the column types to confirm each variable was read as a number.
str(prices)
## 'data.frame':    1000 obs. of  8 variables:
##  $ sale_def   : int  88142 78046 273777 229185 464029 109152 190090 259402 139069 204245 ...
##  $ bed        : int  4 2 4 4 4 3 1 3 3 4 ...
##  $ bath       : num  2 2 2 3 3 2 1 2 2.5 2 ...
##  $ area_heated: int  1270 1037 2821 2341 2981 1307 960 2010 1873 1701 ...
##  $ area       : int  3631 1307 10309 9578 12287 10088 269496 17270 9858 5658 ...
##  $ dist_cbd   : num  5212 14707 20898 7057 7772 ...
##  $ dist_lakes : num  639 199 3799 193 226 ...
##  $ pool       : int  0 0 0 0 1 1 0 0 0 0 ...

2 Plotting

Plot histograms for all variables. Additionally, add scatterplots for the relationships between all quantitative variables.

# Draw one histogram per column of `prices`. Looping over names(prices) keeps
# the plots in sync with the data frame and gives each plot a readable title
# and axis label instead of the default "prices$<column>" text.
for (var_name in names(prices)) {
  hist(
    prices[[var_name]],
    main = paste("Histogram of", var_name),
    xlab = var_name
  )
}

# Interactive scatterplots of selected variable pairs. `mode = "markers"` is
# set explicitly so plotly does not have to guess the mode and emit a
# "No scatter mode specifed" message for every plot.

# Sale price against each house characteristic.
plot_ly(prices, y = ~sale_def, x = ~bed, type = "scatter", mode = "markers")
plot_ly(prices, y = ~sale_def, x = ~area_heated, type = "scatter",
        mode = "markers")
plot_ly(prices, y = ~sale_def, x = ~area, type = "scatter", mode = "markers")
plot_ly(prices, y = ~sale_def, x = ~bath, type = "scatter", mode = "markers")

# Relationships among the predictors themselves.
plot_ly(prices, y = ~dist_lakes, x = ~area, type = "scatter",
        mode = "markers")
plot_ly(prices, y = ~area_heated, x = ~bath, type = "scatter",
        mode = "markers")
plot_ly(prices, y = ~area_heated, x = ~dist_lakes, type = "scatter",
        mode = "markers")

# Sale price against the two distance measures.
plot_ly(prices, y = ~sale_def, x = ~dist_lakes, type = "scatter",
        mode = "markers")
plot_ly(prices, y = ~sale_def, x = ~dist_cbd, type = "scatter",
        mode = "markers")

# Scatterplot matrix covering every pair of quantitative variables, so that no
# pairwise relationship is left unplotted.
pairs(prices[c("sale_def", "bed", "bath", "area_heated", "area",
               "dist_cbd", "dist_lakes")])

3 Summary Statistics

Provide basic summary statistics for univariate analysis. Also, provide the correlation between all the quantitative variables.

# Univariate summary statistics for the response, the deflated sale price.
data <- prices$sale_def
mean(data)
## [1] 200319.3
median(data)
## [1] 151085
max(data)
## [1] 7629992
min(data)
## [1] 15296

# Dispersion of the sale price.
sd(data)
var(data)

# The same statistics for every variable in the data set: the five-number
# summary plus the mean, followed by the standard deviation and the variance.
summary(prices)
vapply(prices, sd, numeric(1))
vapply(prices, var, numeric(1))

# Correlation matrix of the seven quantitative variables; the 0/1 `pool`
# column is left out. The columns carry a "prices." prefix so the printed
# matrix is labelled prices.sale_def, prices.bed, and so on.
quant_cols <- c("sale_def", "bed", "bath", "area_heated", "area",
                "dist_cbd", "dist_lakes")
quant_var <- setNames(prices[quant_cols], paste0("prices.", quant_cols))
cor(quant_var)
##                    prices.sale_def prices.bed prices.bath prices.area_heated
## prices.sale_def         1.00000000 0.32557109  0.58731596         0.69201080
## prices.bed              0.32557109 1.00000000  0.64449531         0.66911599
## prices.bath             0.58731596 0.64449531  1.00000000         0.83255359
## prices.area_heated      0.69201080 0.66911599  0.83255359         1.00000000
## prices.area             0.34354392 0.10437035  0.19298319         0.30184325
## prices.dist_cbd         0.05263700 0.23328277  0.24220637         0.25494517
## prices.dist_lakes      -0.08857844 0.03747888 -0.02109154        -0.04092947
##                    prices.area prices.dist_cbd prices.dist_lakes
## prices.sale_def     0.34354392      0.05263700       -0.08857844
## prices.bed          0.10437035      0.23328277        0.03747888
## prices.bath         0.19298319      0.24220637       -0.02109154
## prices.area_heated  0.30184325      0.25494517       -0.04092947
## prices.area         1.00000000      0.08648327       -0.15432291
## prices.dist_cbd     0.08648327      1.00000000        0.26520451
## prices.dist_lakes  -0.15432291      0.26520451        1.00000000

4 Regression Analysis

Run a regression with all the variables included. Print results of the regression.

# Fit a linear regression of the deflated sale price (sale_def) on all seven
# remaining columns of `prices`, then print the coefficient table, residual
# summary and overall fit statistics.
model <- lm(sale_def ~ bed + bath + area_heated + area + 
              dist_cbd + dist_lakes + pool, data = prices)
summary(model)
## 
## Call:
## lm(formula = sale_def ~ bed + bath + area_heated + area + dist_cbd + 
##     dist_lakes + pool, data = prices)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -765238  -55999    9774   63848 4954827 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.890e+04  3.093e+04  -1.904 0.057170 .  
## bed         -8.492e+04  1.087e+04  -7.810 1.45e-14 ***
## bath         5.449e+04  1.507e+04   3.616 0.000315 ***
## area_heated  2.439e+02  1.430e+01  17.054  < 2e-16 ***
## area         3.548e+00  5.958e-01   5.955 3.61e-09 ***
## dist_cbd    -5.848e+00  1.090e+00  -5.366 1.00e-07 ***
## dist_lakes   9.651e-01  3.810e+00   0.253 0.800091    
## pool        -2.776e+04  1.640e+04  -1.693 0.090787 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 194900 on 992 degrees of freedom
## Multiple R-squared:  0.547,  Adjusted R-squared:  0.5438 
## F-statistic: 171.2 on 7 and 992 DF,  p-value: < 2.2e-16

Which of the variables tested significant at the 95% level? Looking at the results and answering outside of the chunk is sufficient. Answer: bed, bath, area_heated, area, dist_cbd

4.1 Evaluating the model

As is, are any of the Gauss-Markov assumptions violated? If so, which ones? How can you fix the issues?

# Run gvlma's assessment of the linear model assumptions on the fitted model.
# The summary repeats the regression output and then reports a decision for
# the global statistic and for the skewness, kurtosis, link function and
# heteroscedasticity tests.
gvmodel <- gvlma(model)
summary(gvmodel)
## 
## Call:
## lm(formula = sale_def ~ bed + bath + area_heated + area + dist_cbd + 
##     dist_lakes + pool, data = prices)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -765238  -55999    9774   63848 4954827 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.890e+04  3.093e+04  -1.904 0.057170 .  
## bed         -8.492e+04  1.087e+04  -7.810 1.45e-14 ***
## bath         5.449e+04  1.507e+04   3.616 0.000315 ***
## area_heated  2.439e+02  1.430e+01  17.054  < 2e-16 ***
## area         3.548e+00  5.958e-01   5.955 3.61e-09 ***
## dist_cbd    -5.848e+00  1.090e+00  -5.366 1.00e-07 ***
## dist_lakes   9.651e-01  3.810e+00   0.253 0.800091    
## pool        -2.776e+04  1.640e+04  -1.693 0.090787 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 194900 on 992 degrees of freedom
## Multiple R-squared:  0.547,  Adjusted R-squared:  0.5438 
## F-statistic: 171.2 on 7 and 992 DF,  p-value: < 2.2e-16
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = model) 
## 
##                        Value p-value                   Decision
## Global Stat        7497980.4       0 Assumptions NOT satisfied!
## Skewness             45659.6       0 Assumptions NOT satisfied!
## Kurtosis           7450926.5       0 Assumptions NOT satisfied!
## Link Function          782.2       0 Assumptions NOT satisfied!
## Heteroscedasticity     612.1       0 Assumptions NOT satisfied!

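I do not see explanations about the violations of the assumptions. For reference, every gvlma test above is rejected at the 0.05 level: the skewness and kurtosis tests indicate that the residuals are not normally distributed, the link function test indicates that the linear functional form is misspecified, and the heteroscedasticity test indicates that the error variance is not constant. Possible fixes include transforming the skewed variables (for example, taking logs of the price and area variables) and removing extreme outliers.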

4.2 New Model

Based on your findings in the previous section, make changes to the variables, the functional form, etc.

# Test the studentized residuals of `model` for outliers using Bonferroni-
# adjusted p-values (car::outlierTest) and print the flagged observations.
outliers <- outlierTest(model)
print(outliers)
##      rstudent unadjusted p-value Bonferroni p
## 37  57.115829        9.6866e-316  9.6866e-313
## 7   -6.567097         8.2674e-11   8.2674e-08
## 214  4.529055         6.6453e-06   6.6453e-03

# Take the rows to drop from the test result rather than hard-coding their
# indices, so the selection stays correct if the data change. When nothing is
# significant, outlierTest() still reports the largest residual (with
# signif = FALSE); in that case no row is removed.
outlier_rows <- if (isTRUE(outliers$signif)) {
  names(outliers$rstudent)
} else {
  character(0)
}
# Logical subsetting is used because a negative index of length zero would
# return an empty data frame instead of the full one.
newdata <- prices[!(rownames(prices) %in% outlier_rows), ]

# Refit the same specification on the data without the flagged outliers.
# NOTE(review): the functional form is unchanged, so predictions from
# `newmodel` remain on the original price scale; a transformation such as
# log(sale_def) still needs to be evaluated.
newmodel <- lm(sale_def ~ bed + bath + area_heated + area +
                 dist_cbd + dist_lakes + pool, data = newdata)

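The new model is still missing variable transformations (for example, a log transformation of the skewed price and area variables) that would help address the violated assumptions.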

5 Prediction

Based on the following inputs, predict the deflated sales price:

  • 2 bed
  • 2 bath
  • area_heated = 1223
  • area = 9750
  • dist_cbd = 19368
  • dist_lakes = 490
  • no pool
# Single-row data frame describing the property to price; pool = 0 encodes
# "no pool".
Example1 <- data.frame(
  bed = 2, bath = 2.0,
  area_heated = 1223, area = 9750,
  dist_cbd = 19368, dist_lakes = 490,
  pool = 0
)

# Predicted deflated sale price from the model refit without outliers.
predict(newmodel, newdata = Example1)
##        1 
## 117543.7